rm(list=ls(all=TRUE))
pacman::p_load(vcd, magrittr, readr, caTools, ggplot2, dplyr, plotly, ggrepel)
load("data/tf0.rdata")
sapply(list(cust=A0,tid=X0,items=Z0), nrow)
##   cust    tid  items 
##  32241 119328 817182
每週交易次數圖表
Z0$date = as.Date(Z0$date, format="%m/%d/%Y")
hist(Z0$date,'weeks',freq=T,las=2)


主要發現:

● 主要發現一月銷售量相較於其他月份銷售量更突出,推估為過年買氣較旺,可根據此檔期做行銷策略專案。
● 從圖中可發現12月的第四周的交易量較少,可以推測該週可能是因為店內整修,有部分天數店休,導致交易量驟降。

年齡層級、郵遞區號

ggplot(Z0) +geom_bar(aes(x = age, fill = age))

各地區顧客分布
options(scipen=999)#不要科學記號
ggplot(Z0) +geom_bar(aes(x = area, fill = area))

我們得知Ta Feng量販店的會員集中在汐止區與南港區,因此推估商店舖座落在兩區之間。
假設大豐超市位於南港區global mall商城內的國際商品生鮮超市,
引進進口產品,提高高質量零食與生鮮蔬果商品。
超市會員主要多為30-50歲青、壯年年齡居多,推估此年齡層有一定的基本經濟能力。

年齡與地理區隔的關聯性
MOSA = function(formula, data) mosaic(formula, data, shade=T, 
  margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
  gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
  gp_text=gpar(fontsize=7),labeling=labeling_residuals)

MOSA(~area+age, A0)


主要發現:

● 主要顧客來源為南港區及汐止區,信義區及內湖區為其次
● 顧客中30~44歲的年齡層較多
● 南港區較多25歲以下學生族群,較少30~40歲壯年客群
● 汐止區30~40歲的顧客比率比較高



各區年齡層的消費時間
X0$wday = format(X0$date, "%u")
ht <- count(X0, age, wday)
X0A <- merge(X0,ht)

ggplot(X0A, aes(X0A$wday,X0A$age)) + 
    geom_tile(aes(fill = n),colour = "white")
## Warning: Use of `X0A$wday` is discouraged. Use `wday` instead.
## Warning: Use of `X0A$age` is discouraged. Use `age` instead.

    scale_fill_gradient(low = "white",high = "steelblue", limits = c(0, 6000))
## <ScaleContinuous>
##  Range:  
##  Limits:    0 -- 6e+03

假日的各年齡層購買力較平日強,尤其30~39歲族群最顯著

簡單泡泡圖

年齡區隔特徵
A0 %>% group_by(age) %>% summarise(
  Group.Size = n(),              # 族群人數
  avg.Freq = mean(f),            # 平均購買次數
  avg.Revenue = sum(f*m)/sum(f)  # 平均客單價
  ) %>% 
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=age)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("年齡區隔特徵 (泡泡大小:族群人數)") + 
  ylab("平均購買次數") + xlab("平均客單價")

mean(A0$age == "a99")
## [1] 0.01941627

由於a99(沒有年齡資料的顧客)人數不多,而且特徵很獨特,探索時我們可以考慮濾掉這群顧客

A0 %>% filter(age!="a99") %>%    # 濾掉沒有年齡資料的顧客('a99')
  group_by(age) %>% summarise(
  Group.Size = n(),              # 族群人數
  avg.Freq = mean(f),            # 平均購買次數
  avg.Revenue = sum(f*m)/sum(f)  # 平均客單價
  ) %>% 
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=age)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("年齡區隔特徵 (泡泡大小:族群人數)") + 
  ylab("平均購買次數") + xlab("平均客單價")

地理區隔特徵
A0 %>% filter(age!="a99") %>%    # 濾掉沒有年齡資料的顧客('a99')
  group_by(area) %>% summarise(
  Group.Size = n(),              # 族群人數
  avg.Freq = mean(f),            # 平均購買次數
  avg.Revenue = sum(f*m)/sum(f)  # 平均客單價
  ) %>% 
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=area, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=area)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("地理區隔特徵 (泡泡大小:族群人數)") + 
  ylab("平均購買次數") + xlab("平均客單價")

情境推測:
1. 假設Ta Feng為位於南港區的購物中心內的量販店
2. 住較近學生及上班族群下班下課後的聚會地點
3. 因地緣關係,信義區跟內湖區等顧客可能只在假日特地來購物商城逛街補貨,因此單次購買量較高
4. 30~50歲平均購買單價較其他年齡層高,推測經濟能力較好

產品銷售狀況

prodA = Z0 %>% group_by(prod) %>% summarise(
  totalQty = sum(qty),
  totalRev = sum(price),
  totalGross = sum(price) - sum(cost),
  grossMargin = totalGross/totalRev,
  avgPrice = totalRev/totalQty
  )
prodA
## # A tibble: 23,789 x 6
##    prod          totalQty totalRev totalGross grossMargin avgPrice
##    <chr>            <dbl>    <dbl>      <dbl>       <dbl>    <dbl>
##  1 0002884011363        1      279         79       0.283    279  
##  2 0008635012177      167    18474       3611       0.195    111. 
##  3 0008635099680       30     2710        580       0.214     90.3
##  4 0010742201412       27     2101        400       0.190     77.8
##  5 0010742201610       30     2316        426       0.184     77.2
##  6 0010742201719       17     1331        260       0.195     78.3
##  7 0010742201818       21     1651        328       0.199     78.6
##  8 0010742202112        3      237         48       0.203     79  
##  9 0010742202211       28     2172        408       0.188     77.6
## 10 0010742206615       35     2675        470       0.176     76.4
## # ... with 23,779 more rows
prod1 = Z0 %>% group_by(prod) %>% summarise(
  noProd = n_distinct(prod),
  totalQty = sum(qty),
  totalRev = sum(price),
  totalGross = sum(price) - sum(cost),
  grossMargin = totalGross/totalRev,
  avgPrice = totalRev/totalQty
  )
prod1 <- prod1 %>% arrange(desc(totalQty,avgPrice)) %>% head(10)
ggplot(prod1, aes(x = avgPrice, y = totalQty))+geom_point(aes(col = totalGross,size = totalRev))+scale_color_gradientn(colors=c("seagreen","gold","red"))+geom_text_repel(aes(avgPrice, totalQty, label = prod))

top2000產品金額與營收
top2000 = prodA %>% top_n(2000, totalRev)
g1=ggplot(top2000, aes(x=totalRev, y=avgPrice, col=prod)) +
  geom_point()
ggplotly(g1)

我們發現商品平均售價不高,同時我們也可以發現最暢銷的產品不一定會賺錢,因此大豐超市需要推廣更多毛利較高的產品,才能創造更好的營收。

產品資訊
cats = Z0 %>% group_by(cat) %>% summarise(
  noProd = n_distinct(prod),
  totalQty = sum(qty),
  totalRev = sum(price),
  totalGross = sum(price) - sum(cost),
  grossMargin = totalGross/totalRev,
  avgPrice = totalRev/totalQty
  )
#商品探索圖
top_cat <- cats %>% arrange(desc(totalGross)) %>% head(10)
top_cat$cat <- as.factor(top_cat$cat)
ggplot(top_cat, aes(x=cat, y=avgPrice, fill=cat))+geom_col()

test2 <- Z0 %>% filter(age %in% c("a34","a39","a44","a49"))
test3 <- test2 %>%
  group_by(prod) %>%
  summarise(prod_qty = sum(qty)) %>%
  arrange(desc(prod_qty)) 
test4 <-merge(test2,test3)
購買次數前10品項在30-50歲的購買狀況
t1 <-test3 %>% head(10)
t2 <- merge(test2,t1)
t2$cat <- as.factor(t2$cat)
ggplot(t2, aes(x=age, y=qty, fill=cat)) + geom_col()

總毛利最好的前50項品項
# 對品類(`category`)做彙總 
cattest = Z0 %>%                        
  group_by(cat) %>% #根據每個品項每月會有一個值
  summarise(                           
    totalQty = sum(qty),
    totalRev = sum(price),
    totalGross = sum(price) - sum(cost),
    grossMargin = totalGross/totalRev,
    avgPrice = totalRev/totalQty
  ) %>% 
  arrange(cat)            
cattest2 = cattest %>% as.data.frame
cattest2 = cattest2 %>% arrange(desc(totalGross)) %>% head(50)
cattest2$cat <- as.factor(cattest2$cat)
a=qplot(x = avgPrice,
        y = totalGross ,
        data= cattest2,
        color = cat,
        size = totalGross)
options(scipen=999)#不要科學記號
ggplotly(a)

320402為單價高但毛利最好的產品

營收貢獻(rev)最大的100個品類與平均價格
col6 = c('seagreen','gold','orange',rep('red',3))
gg2= group_by(Z0, cat) %>% summarise(
  solds = n(), qty = sum(qty), rev = sum(price), cost = sum(cost), 
  profit = rev - cost, margin = 100*profit/rev , avg_price = rev/qty
  ) %>% 
  top_n(100, profit) %>% 
  ggplot(aes(x=margin, y=rev, col=profit, label=cat, label2=avg_price)) + 
  geom_point(size=2,alpha=0.8) + scale_y_log10() + 
  scale_color_gradientn(colors=col6) +
  theme_bw()
ggplotly(gg2)

探查此區高毛利率但營收不高的前五名品類 #501002,501001,500903,500705,560336

以下為cat5,毛利率前幾名相關資訊
catA1 = subset(Z0, cat=="501001") #篩選出501001品類資料
catA2 = catA1 %>% group_by(tid) %>% summarise(
  date = date[1],             # 交易日期  
  cust = cust[1],             # 顧客 ID
  age = age[1],               # 顧客 年齡級別
  area = area[1],             # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame            
nrow(catA2)   
## [1] 1112
sapply(catA2[,6:9], quantile, prob=c(.999, .9995, .9999)) #處理離群值
##        items  pieces    total    gross
## 99.9%      4 18.2230 1788.204 464.2390
## 99.95%     4 19.4445 1889.446 533.8955
## 99.99%     4 19.8889 1897.889 586.7791
catA2 = subset(catA2, items<=4 & pieces<19.889 & total<1897.9,gross<586.78) 
par(cex=0.8)
hist(catA2$date, "weeks", freq=T, las=2, main="cat501001 per Week")

X0$wday = format(X0$date, "%u")  #cat501001購買年齡層與週間
catA3 <- merge(catA2,X0)
catA31 <- count(catA3, age, wday)
catA4<- merge(catA31,catA3)

A4=ggplot(catA4, aes(catA4$wday,catA4$age)) + 
    geom_tile(aes(fill = n),colour = "white")+
    scale_fill_gradient2(midpoint = 1, mid = "lightblue", limits = c(-10,10 ))+ theme_bw()
A4
## Warning: Use of `catA4$wday` is discouraged. Use `wday` instead.
## Warning: Use of `catA4$age` is discouraged. Use `age` instead.

cat501001在週六的購買情況較好

catB1 = subset(Z0, cat=="501002") #篩選出501002品類資料
catB2 = catB1 %>% group_by(tid) %>% summarise(
  date = date[1],             # 交易日期  
  cust = cust[1],             # 顧客 ID
  age = age[1],               # 顧客 年齡級別
  area = area[1],             # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame            
nrow(catB2)   
## [1] 1298
sapply(catB2[,6:9], quantile, prob=c(.999, .9995, .9999)) #處理離群值
##        items pieces    total    gross
## 99.9%      3      6 788.5860 228.4360
## 99.95%     3      6 841.7985 252.7385
## 99.99%     3      6 893.1597 283.3477
catB2 = subset(catB2, items<=3 & pieces<6 & total<893.16,gross<283.35) 
par(cex=0.8)
hist(catB2$date, "weeks", freq=T, las=2, main="cat501002 per Week")

X0$wday = format(X0$date, "%u")  #cat501002購買年齡層與週間
catB3 <- merge(catB2,X0)
catB31 <- count(catB3, age, wday)
catB4<- merge(catB31,catB3)

B4=ggplot(catB4, aes(catB4$wday,catB4$age)) + 
    geom_tile(aes(fill = n),colour = "white")+
    scale_fill_gradient2(midpoint = 1, mid = "lightblue", limits = c(0,5 ))+ theme_bw()
B4
## Warning: Use of `catB4$wday` is discouraged. Use `wday` instead.
## Warning: Use of `catB4$age` is discouraged. Use `age` instead.

cat501002在週一及週三的購買情況較好

catC1 = subset(Z0, cat=="500903") #篩選出500903品類資料
catC2 = catC1 %>% group_by(tid) %>% summarise(
  date = date[1],             # 交易日期  
  cust = cust[1],             # 顧客 ID
  age = age[1],               # 顧客 年齡級別
  area = area[1],             # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame            
nrow(catC2)   
## [1] 1024
sapply(catC2[,6:9], quantile, prob=c(.999, .9995, .9999)) #處理離群值
##         items  pieces    total    gross
## 99.9%  3.0000  8.0000 2367.713  678.297
## 99.95% 3.4885 15.8160 2942.005  880.331
## 99.99% 3.8977 22.3632 3404.401 1046.466
catC2 = subset(catC2, items<=3.8977 & pieces<=22.363 & total<3404.4,gross<1046.47) 
par(cex=0.8)
hist(catC2$date, "weeks", freq=T, las=2, main="cat500903 per Week")

X0$wday = format(X0$date, "%u")  #cat500903購買年齡層與週間
catC3 <- merge(catC2,X0)
catC31 <- count(catC3, age, wday)
catC4<- merge(catC31,catC3)

C4=ggplot(catC4, aes(catC4$wday,catC4$age)) + 
    geom_tile(aes(fill = n),colour = "white")+
    scale_fill_gradient2(midpoint = 1, mid = "lightblue", limits = c(0,5 ))+ theme_bw()
C4
## Warning: Use of `catC4$wday` is discouraged. Use `wday` instead.
## Warning: Use of `catC4$age` is discouraged. Use `age` instead.

catD1 = subset(Z0, cat=="500705") #篩選出500705品類資料
catD2 = catD1 %>% group_by(tid) %>% summarise(
  date = date[1],             # 交易日期  
  cust = cust[1],             # 顧客 ID
  age = age[1],               # 顧客 年齡級別
  area = area[1],             # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame            
nrow(catD2)   
## [1] 525
sapply(catD2[,6:9], quantile, prob=c(.999, .9995, .9999)) #處理離群值
##         items pieces    total    gross
## 99.9%  5.4760 29.520 4739.436 1365.768
## 99.95% 5.7380 34.760 5580.718 1553.884
## 99.99% 5.9476 38.952 6253.744 1704.377
catD2 = subset(catD2, items<=5.9476 & pieces<=38.95 & total<6253.7,gross<1704.4) 
par(cex=0.8)
hist(catD2$date, "weeks", freq=T, las=2, main="cat500705 per Week")

X0$wday = format(X0$date, "%u")  #cat500705購買年齡層與週間
catD3 <- merge(catD2,X0)
catD31 <- count(catD3, age, wday)
catD4<- merge(catD31,catD3)

D4=ggplot(catD4, aes(catD4$wday,catD4$age)) + 
    geom_tile(aes(fill = n),colour = "white")+
    scale_fill_gradient2(midpoint = 1, mid = "lightblue", limits = c(0,5 ))+ theme_bw()
D4
## Warning: Use of `catD4$wday` is discouraged. Use `wday` instead.
## Warning: Use of `catD4$age` is discouraged. Use `age` instead.

catE1 = subset(Z0, cat=="560336") #篩選出560336品類資料
catE2 = catE1 %>% group_by(tid) %>% summarise(
  date = date[1],             # 交易日期  
  cust = cust[1],             # 顧客 ID
  age = age[1],               # 顧客 年齡級別
  area = area[1],             # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame            
nrow(catE2)   
## [1] 1138
sapply(catE2[,6:9], quantile, prob=c(.999, .9995, .9999)) #處理離群值
##         items pieces     total    gross
## 99.9%  3.0000 5.8630  580.7670 162.9450
## 99.95% 3.4315 6.8630  790.4145 201.6775
## 99.99% 3.8863 7.7726 1010.0829 240.3355
catE2 = subset(catE2, items<=3.88 & pieces<=7.77 & total<1010.08,gross<240.34) 
par(cex=0.8)
hist(catE2$date, "weeks", freq=T, las=2, main="cat560336 per Week")

X0$wday = format(X0$date, "%u")  #cat560336購買年齡層與週間
catE3 <- merge(catE2,X0)
catE31 <- count(catE3, age, wday)
catE4<- merge(catE31,catE3)

E4=ggplot(catE4, aes(catE4$wday,catE4$age)) + 
    geom_tile(aes(fill = n),colour = "white")+
    scale_fill_gradient2(midpoint = 1, mid = "lightblue", limits = c(0,5 ))+ theme_bw()
E4
## Warning: Use of `catE4$wday` is discouraged. Use `wday` instead.
## Warning: Use of `catE4$age` is discouraged. Use `age` instead.

購物籃分析

pacman::p_load(Matrix, arules, arulesViz)
Z0$tid %>% n_distinct
## [1] 119422
Z0$cat %>% n_distinct
## [1] 2007
p = count(Z0, cat, sort=T)
pk = p$cat[1:2007]
Z = filter(Z0, cat %in% pk)
tr = as(split(Z[,"cat"], Z[,"tid"]), "transactions"); tr
## Warning in asMethod(object): removing duplicated items in transactions
## transactions in sparse format with
##  119422 transactions (rows) and
##  2007 items (columns)
rules <- apriori(tr, parameter=list(supp=0.00005, conf=0.5))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5 0.00005      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 5 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2007 item(s), 119422 transaction(s)] done [0.16s].
## sorting and recoding items ... [1741 item(s)] done [0.01s].
## creating transaction tree ... done [0.06s].
## checking subsets of size 1 2 3 4 5 6 7 8 done [2.42s].
## writing ... [584757 rule(s)] done [0.25s].
## creating S4 object  ... done [0.31s].
summary(rules)
## set of 584757 rules
## 
## rule length distribution (lhs + rhs):sizes
##      2      3      4      5      6      7      8 
##     55  14701 219926 269094  73683   7050    248 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   4.000   5.000   4.725   5.000   8.000 
## 
## summary of quality measures:
##     support             confidence        coverage               lift         
##  Min.   :0.00005024   Min.   :0.5000   Min.   :0.00005024   Min.   :   4.068  
##  1st Qu.:0.00005024   1st Qu.:0.5455   1st Qu.:0.00007536   1st Qu.:   6.358  
##  Median :0.00005862   Median :0.6000   Median :0.00009211   Median :   9.537  
##  Mean   :0.00006622   Mean   :0.6470   Mean   :0.00010729   Mean   :  13.516  
##  3rd Qu.:0.00006699   3rd Qu.:0.7368   3rd Qu.:0.00011723   3rd Qu.:  13.897  
##  Max.   :0.00617139   Max.   :1.0000   Max.   :0.01216694   Max.   :2985.550  
##      count        
##  Min.   :  6.000  
##  1st Qu.:  6.000  
##  Median :  7.000  
##  Mean   :  7.908  
##  3rd Qu.:  8.000  
##  Max.   :737.000  
## 
## mining info:
##  data ntransactions support confidence
##    tr        119422 0.00005        0.5

我們運用購物籃分析法,尋找消費者

cat5 <- c("501002","500705","560336","501001","500903")
rx = subset(rules, subset = lift > 5 & count > 7  & rhs %in% cat5) 
inspect(rx)
##      lhs                       rhs      support       confidence coverage    
## [1]  {560351}               => {560336} 0.00032657299 0.5000000  0.0006531460
## [2]  {560201,560327}        => {560336} 0.00006698933 0.5000000  0.0001339787
## [3]  {560104,560351}        => {560336} 0.00006698933 0.6666667  0.0001004840
## [4]  {560351,560402}        => {560336} 0.00007536300 0.7500000  0.0001004840
## [5]  {560201,560351}        => {560336} 0.00012560500 0.5172414  0.0002428363
## [6]  {560334,570311}        => {560336} 0.00006698933 0.6153846  0.0001088577
## [7]  {560316,570206}        => {560336} 0.00007536300 0.5625000  0.0001339787
## [8]  {560316,560334}        => {560336} 0.00008373666 0.5000000  0.0001674733
## [9]  {560316,560339}        => {560336} 0.00007536300 0.5294118  0.0001423523
## [10] {560337,570206}        => {560336} 0.00008373666 0.5263158  0.0001590997
## [11] {560337,570306}        => {560336} 0.00006698933 0.5000000  0.0001339787
## [12] {560334,560337}        => {560336} 0.00008373666 0.5000000  0.0001674733
## [13] {560330,560335}        => {560336} 0.00007536300 0.6000000  0.0001256050
## [14] {560322,560334}        => {560336} 0.00007536300 0.5000000  0.0001507260
## [15] {100212,501001}        => {501002} 0.00006698933 0.5333333  0.0001256050
## [16] {560201,560334,570306} => {560336} 0.00006698933 0.6153846  0.0001088577
## [17] {530110,560201,560339} => {560336} 0.00006698933 0.5000000  0.0001339787
##      lift     count
## [1]  52.47012 39   
## [2]  52.47012  8   
## [3]  69.96016  8   
## [4]  78.70518  9   
## [5]  54.27944 15   
## [6]  64.57861  8   
## [7]  59.02889  9   
## [8]  52.47012 10   
## [9]  55.55660  9   
## [10] 55.23171 10   
## [11] 52.47012  8   
## [12] 52.47012 10   
## [13] 62.96415  9   
## [14] 52.47012  9   
## [15] 49.06913  8   
## [16] 64.57861  8   
## [17] 52.47012  8

我們希望提高毛利率高但銷售量少的商品的購買率,
因此藉由購物籃的分析法尋找商品間的關聯性,
我們發現購買#560201可以提高#560336的購買率,
而購買#501002可以提高#5601001的購買率,於是可以用在我們的行銷策略上。